import pandas as pd
import numpy as np
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import openpyxl
import tqdm
# Fixed random seed; it is passed as random_state when the UMAP model is built further down.
SEED = 123456789
สวรส.
# Load the HSRI spreadsheet, then show its schema and the number of
# missing values in each column.
DATA_DIR_HRSI = "../BERTOPIC-HEALTHY-RESERCH/DATASET/hsri.xlsx"
hsri = pd.read_excel(io=DATA_DIR_HRSI)

hsri.info()
print('\nNull')
hsri.isna().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5909 entries, 0 to 5908 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 index 5909 non-null int64 1 itemset 5909 non-null int64 2 title 1914 non-null object 3 names 1914 non-null object 4 date 1914 non-null object 5 abstract 1913 non-null object dtypes: int64(2), object(4) memory usage: 277.1+ KB Null
index 0 itemset 0 title 3995 names 3995 date 3995 abstract 3996 dtype: int64
# Load the BMC spreadsheet, then show its schema and the number of
# missing values in each column.
DATA_DIR_BMC = "../BERTOPIC-HEALTHY-RESERCH/DATASET/bmc.xlsx"
bmc = pd.read_excel(io=DATA_DIR_BMC)

bmc.info()
print('\nNull')
bmc.isna().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1461 entries, 0 to 1460 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 index 1461 non-null int64 1 pages 1461 non-null int64 2 title 1429 non-null object 3 names 1429 non-null object 4 date 1429 non-null object 5 abstract 1429 non-null object 6 url 1461 non-null object dtypes: int64(2), object(5) memory usage: 80.0+ KB Null
index 0 pages 0 title 32 names 32 date 32 abstract 32 url 0 dtype: int64
# Stack the research records of both sources into one dataframe.
# The shared columns are selected by name rather than by position, because the
# two sheets do not have identical schemas and positional slicing only works
# while the column order of the concatenated frame happens to line up.
df = pd.concat([hsri, bmc])[['title', 'names', 'date', 'abstract']].copy()

# Decode the Excel escape sequences in the abstract text.
# Missing abstracts are skipped (na_action='ignore') so they stay NaN; casting
# the whole column to str first would turn them into the literal text 'nan',
# which a later dropna() cannot remove.
df['abstract'] = df['abstract'].map(
    lambda text: openpyxl.utils.escape.unescape(str(text)),
    na_action='ignore',
)
df
| title | names | date | abstract | |
|---|---|---|---|---|
| 0 | NaN | NaN | NaN | nan |
| 1 | Pufferfish Poisoning | เปี่ยมศักดิ์ เมนุเศวต | 2550 | Pufferfish belong to two families of marine an... |
| 2 | A Study of a Refill Prescription Service Syste... | ระพีพรรณ ฉลองสุข | 2550 | The refilling of prescriptions for patients wi... |
| 3 | Evaluation of the Impact of Local Wisdom on Sa... | วิทยา เมฆขำ | 2550 | A study was carried out to evaluate the impact... |
| 4 | Looking at Health Promotion and Disease Preven... | ประคิณ สุจฉายา | 2550 | Children under five years old normally grow an... |
| ... | ... | ... | ... | ... |
| 1456 | A contingent valuation study to estimate the p... | Amin Mo | 24 June 2004 | We used contingent valuation technique to esti... |
| 1457 | Readiness to change physical activity and diet... | Taylor Wendell C | 10 June 2004 | BackgroundComplementary or discrepant stages o... |
| 1458 | "Harnessing genomics to improve health in Indi... | Acharya Tara | 19 May 2004 | BackgroundThe benefits of scientific medicine ... |
| 1459 | The utilisation of health research in policy-m... | Hanney Stephen R | 13 January 2003 | The importance of health research utilisation ... |
| 1460 | Assessing capacity for health policy and syste... | Gonzalez Block Miguel A | 13 January 2003 | BackgroundAs demand grows for health policies ... |
7370 rows × 4 columns
# Number of missing values in each column of the combined dataframe.
df.isna().sum()
title 4027 names 4027 date 4027 abstract 0 dtype: int64
# Keep only the rows in which every column holds a value, then inspect the result.
# dropna() removes real missing values (NaN/None) only; placeholder strings
# such as 'nan' are ordinary text to pandas and are kept.
df = df.dropna(axis=0, how='any')
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'> Index: 3343 entries, 1 to 1460 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 title 3343 non-null object 1 names 3343 non-null object 2 date 3343 non-null object 3 abstract 3343 non-null object dtypes: object(4) memory usage: 130.6+ KB
| title | names | date | abstract | |
|---|---|---|---|---|
| 1 | Pufferfish Poisoning | เปี่ยมศักดิ์ เมนุเศวต | 2550 | Pufferfish belong to two families of marine an... |
| 2 | A Study of a Refill Prescription Service Syste... | ระพีพรรณ ฉลองสุข | 2550 | The refilling of prescriptions for patients wi... |
| 3 | Evaluation of the Impact of Local Wisdom on Sa... | วิทยา เมฆขำ | 2550 | A study was carried out to evaluate the impact... |
| 4 | Looking at Health Promotion and Disease Preven... | ประคิณ สุจฉายา | 2550 | Children under five years old normally grow an... |
| 6 | Over-crowding Problems in Hospitals | สุพัตรา ศรีวณิชชากร | 2550 | Overcrowding in hospitals’ out-patient sectors... |
#word_tokenize => delete stop_words
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
# Cache for the NLTK English stop-word set; filled on first use.
_ENGLISH_STOPWORDS = None


def clear_stopword(sentence_list, stop_words=None):
    """Return the tokens of *sentence_list* that are not stop words.

    Args:
        sentence_list: Iterable of tokens (e.g. the output of word_tokenize).
        stop_words: Optional collection of words to remove. When omitted, the
            NLTK English stop-word list is used.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            # Build the set once: calling stopwords.words() for every token
            # re-creates the whole list and forces a linear scan each time.
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    else:
        stop_words = frozenset(stop_words)
    return [token for token in sentence_list if token not in stop_words]
# Build the modelling corpus: lower-case each abstract, tokenize it, remove
# the stop words and join the remaining tokens back into a single string.
abstracts = [
    ' '.join(clear_stopword(word_tokenize(text.lower())))
    for text in df['abstract'].values
]
# Choose the compute device: use CUDA when it is available, otherwise the CPU.
import torch

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)
cuda
from sentence_transformers import SentenceTransformer

# Encode every pre-processed abstract with a sentence-transformer model that
# runs on the selected device.
# Alternative checkpoint noted by the author: paraphrase-multilingual-MiniLM-L12-v2
sentence_model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device=device,
)
embeddings = sentence_model.encode(
    abstracts,
    batch_size=64,
    show_progress_bar=True,
)
C:\Users\acer\AppData\Local\Programs\Python\Python310\lib\site-packages\tqdm\auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm Batches: 100%|█████████████████████████████████████████████████████████████████████████| 53/53 [00:24<00:00, 2.17it/s]
# Sanity check: show the shape of the embedding matrix produced by encode().
embeddings.shape
(3343, 384)
from umap import UMAP

# Configure (but do not yet fit) the UMAP reducer that projects the sentence
# embeddings, (3343, 384) in the recorded run, down to 2 components.
# random_state is pinned to SEED.
n_neighbors = 60
min_dist = 0.1
umap_model = UMAP(
    n_neighbors=n_neighbors,
    n_components=2,
    min_dist=min_dist,
    metric='cosine',
    verbose=False,
    random_state=SEED,
)
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer used for the topic representations: every run of non-whitespace
# characters counts as one token. The pattern is a raw string because "\S" in
# a normal string literal is an invalid escape sequence.
vectorizer_model = CountVectorizer(token_pattern=r"\S+")

# BERTopic model built on the pre-configured UMAP reducer and the vectorizer.
# Options the author tried but left disabled: embedding_model=sentence_model,
# language="thai", representation_model=representation_model, min_topic_size=50.
topic_model = BERTopic(
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    language='english',
    verbose=0,
)

# Fit on the pre-processed abstracts, reusing the embeddings computed earlier
# instead of letting BERTopic embed the documents again.
topics, probs = topic_model.fit_transform(abstracts, embeddings)
# Show the first 12 rows (and first three columns) of the topic overview table.
token_word = topic_model.get_topic_info()
token_word.head(12).iloc[:, :3]
| Topic | Count | Name | |
|---|---|---|---|
| 0 | -1 | 961 | -1_health_study_care_system |
| 1 | 0 | 1160 | 0_research_health_evidence_policy |
| 2 | 1 | 84 | 1_health_primary_care_service |
| 3 | 2 | 69 | 2_diabetes_diabetic_patients_blood |
| 4 | 3 | 68 | 3_drug_drugs_price_pharmaceutical |
| 5 | 4 | 65 | 4_cost_hospitals_hospital_per |
| 6 | 5 | 62 | 5_covid19_care_people_infection |
| 7 | 6 | 58 | 6_scheme_coverage_health_care |
| 8 | 7 | 55 | 7_hiv_policy_research_sexual |
| 9 | 8 | 50 | 8_tb_tuberculosis_patients_treatment |
| 10 | 9 | 50 | 9_water_farmers_environmental_pollution |
| 11 | 10 | 49 | 10_elderly_care_older_welfare |
# Plot the topics with BERTopic's visualize_topics at 1920x1080 and save the
# figure as a standalone HTML file.
fig = topic_model.visualize_topics(
    width=1920,
    height=1080,
)
fig.write_html("de-topic.html")

# Bar-chart view of the topics produced by BERTopic.
topic_model.visualize_barchart()
# Project the sentence embeddings computed earlier to 2-D with the UMAP model.
# NOTE(review): this fits umap_model again, and the same object was handed to
# BERTopic - confirm that refitting it here is intended.
project_emb_umap = umap_model.fit_transform(embeddings)

# Document-level view of the topics, drawn on the 2-D projection.
# NOTE(review): custom_labels=True is passed, but no custom topic labels are
# set anywhere in the visible code - confirm whether this flag has any effect.
topic_model.visualize_documents(
    abstracts,
    reduced_embeddings=project_emb_umap,
    custom_labels=True,
)
# Scatter plot of the 2-D UMAP projection: one small, semi-transparent dot per abstract.
ax = plt.gca()
ax.set_title('UMAP Projected Embeddings of abstract HSRI. & BMC')
ax.scatter(project_emb_umap[:, 0], project_emb_umap[:, 1], s=1, alpha=0.2)
plt.show()
https://github.com/poloclub/wizmap.git
import wizmap

# Split the 2-D projection into plain Python lists of x and y coordinates.
xs, ys = (
    project_emb_umap[:, axis].astype(float).tolist()
    for axis in (0, 1)
)

# Build the two WizMap inputs (data list and grid dictionary) from the
# coordinates and the original abstract texts.
data_list = wizmap.generate_data_list(xs, ys, df['abstract'].values)
grid_dict = wizmap.generate_grid_dict(
    xs,
    ys,
    df['abstract'].values,
    'abstracts of healthy system research',
)

# Write both structures as JSON files into the WizMap dataset directory.
wizmap.save_json_files(
    data_list,
    grid_dict,
    output_dir='../BERTOPIC-HEALTHY-RESERCH/DATASET/WIZMAP_DATASET/HSRI_BMC/',
)
Start generating data list... Start generating contours... Start generating multi-level summaries...
3343it [00:00, 145406.60it/s] 100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00, 1.70it/s]
from IPython.display import IFrame
from urllib.parse import urlencode

# WizMap web demo: https://poloclub.github.io/wizmap
# grid.json and data.ndjson are stored on GitHub so that the demo can fetch them.
grid_url = 'https://raw.githubusercontent.com/Dont-HurtMe/Visualization-Topic-model/main/DATASET/DATA_WIZMAP/grid.json'
data_url = 'https://raw.githubusercontent.com/Dont-HurtMe/Visualization-Topic-model/main/DATASET/DATA_WIZMAP/data.ndjson'

# Full demo page URL. The two file locations travel as the dataURL and gridURL
# query parameters, so they are percent-encoded; this yields
#   https://poloclub.github.io/wizmap/?dataURL=https%3A%2F%2Fraw.githubusercontent.com%2F...%2Fdata.ndjson
#       &gridURL=https%3A%2F%2Fraw.githubusercontent.com%2F...%2Fgrid.json
# and keeps the link valid even if a file URL contains '&', '?' or '#'.
display_url = 'https://poloclub.github.io/wizmap/?' + urlencode(
    {'dataURL': data_url, 'gridURL': grid_url}
)
# Embed the WizMap demo page in the notebook through an IPython IFrame.
IFrame(src=display_url, width=1470, height=720)

# Same data shown through wizmap's own visualize helper.
wizmap.visualize(data_url, grid_url, height=700)